library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qc
library(org.Mm.eg.db) # To annotate the genenames
sce <- readRDS(here("processed", "sce.RDS"))
The object has 32285 genes and 113757 cells before filtering
First we need to sort the gene names and gene symbols, because the default ensembl notation is not very handy. And then save the mitochondrial genes as such.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
# obtain full genenames
genename <- mapIds(org.Mm.eg.db, keys = rownames(sce), keytype = "ENSEMBL", column = c("GENENAME"))
# Use the symbols as rownames
# first make gene names unique
# TODO: save duplicate gene name list
symb_unique <- uniquifyFeatureNames(rownames(sce), rowData(sce)[, "Symbol"])
# Now they can be used as rownames
rownames(sce) <- symb_unique
# Add full gene names and the uniuqe symbols to the rowdata
rowData(sce)$symb_uniq <- symb_unique
rowData(sce)$gene_name <- genename
# Subset the mitochondrial genes
is_mito <- grepl("^mt-", rownames(sce))
}else{
sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}
Then we can use the scater package to add the quality per cell. This computes for each cell some usefull metrics such as the number of umi counts (library size), the number of detected genes and the percentage of mitochondiral genes.
Then we use the automatic isOutlier function from the same package that determine which values in a numeric vector are outliers based on the median absolute deviation (MAD). When using this function with low number a log transformation is added, that prevents negative thresholds.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce <- addPerCellQC(sce, subsets = list(mt=is_mito)) }
# Automated outlier detection
outlier_lib_low <- isOutlier(sce$total, log = TRUE, type = "lower")
outlier_expr_low <-
isOutlier(sce$detected, log = TRUE, type = "lower")
outlier_lib_high <- isOutlier(sce$total, type = "higher")
outlier_expr_high <-
isOutlier(sce$detected, type = "higher")
outlier_mt <- isOutlier(sce$subsets_mt_percent, type = "higher")
# total
outlier <-
outlier_lib_low |
outlier_expr_low |
outlier_lib_high | outlier_expr_high | outlier_mt
# See how many cells are detected as outliers
DataFrame(
lib_size_high = sum(outlier_lib_high),
expression_high = sum(outlier_expr_high),
lib_size_low = sum(outlier_lib_low),
expression_low = sum(outlier_expr_low),
mt_pct = sum(outlier_mt),
total = sum(outlier)
)
## DataFrame with 1 row and 6 columns
## lib_size_high expression_high lib_size_low expression_low mt_pct total
## <integer> <integer> <integer> <integer> <integer> <integer>
## 1 4059 170 0 2063 18262 24086
# And the thresholds
attr(outlier_lib_high, "thresholds")
## lower higher
## -Inf 18374.92
attr(outlier_expr_high, "thresholds")
## lower higher
## -Inf 6876.803
attr(outlier_lib_low, "thresholds")
## lower higher
## 307.5535 Inf
attr(outlier_expr_low, "thresholds")
## lower higher
## 230.9287 Inf
attr(outlier_mt, "thresholds")
## lower higher
## -Inf 17.54768
# Add if it is an outlier to the metadata
sce$outlier <- outlier
Diagnostic plots to visualize the data distribution. The orange cells are marked as outliers by the automatic detection from scater. These thresholds will probably be manually adjusted to better fit the distribution of our data.
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="detected", colour_by="outlier") +
scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="chip") +
scale_y_log10() + ggtitle("total count by batch") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="subsets_mt_percent", colour_by="outlier") + ggtitle("Mitocchondrial percentatge") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
In the x axis we can see the total number of umi (library size) per cell, the number of detected genes per cell and the mitochondrial percentage per cell; with the number of cells for each measure in the y axis.
hist(
sce$total,
breaks = 100
)
This object had already been filtrated with the cell-calling algorithm from CellRanger, that is meant to remove empty droplets. Therefore it is expected to see the total sum of umi skewed as in the plot above.
hist(
sce$detected,
breaks = 100
)
A bimodal plot is not something usual for the detected number of genes. This bimodality was already visible in the violin plots.
hist(
sce$subsets_mt_percent,
breaks = 100
)
There is a very heavy tail of cells with high mitochondrial genes.
plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier")
plotColData(sce, x="sum", y="detected", colour_by="outlier")
plotColData(sce, x="sum", y="detected", colour_by="Sample")
Here we run a PCA using the information in the metadata instead of the gene expression. It is useful to visualize the QC parametres.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce <- runColDataPCA(sce, variables = c("sum","detected", "subsets_mt_percent"))
}
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample")
sce$chip <- as.character(sce$chip)
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "chip")
This measures the number of detected genes per cell dividided by its lirary size. This will be very useful to get rid of the cells that have low gene counts but a relatively high umi count.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce$ratio_detected_sum <- sce$detected/sce$sum
sce$outlier_ratio <- isOutlier(sce$ratio_detected_sum, type = "both")
}
summary(sce$ratio_detected_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005707 0.326323 0.395047 0.404313 0.476884 0.878004
plotColData(sce, x="sum", y="detected", colour_by="ratio_detected_sum")
# Plot an histogram with the ratio between umi and gene counts
hist(
sce$ratio_detected_sum,
breaks = 100
)
The isoutlier function did not work very well with the sum of umi and the detected genes. This is probably due to the fact that this function expects roughly a normal distribution. Bellow we use it with the ratio between these two parametres, that do follow a normal distribution.
# Use the is outlier function from scater to see the cutoffs suggestions
plotColData(sce, x="sum", y="detected", colour_by="outlier_ratio")
attr(sce$outlier_ratio, "thresholds")
## lower higher
## 0.0649170 0.7251773
The suggestions here are very reasonable, it is not enough to delete all the cells we do not trust but we can definitely take this into consideration as a threshold parameter.
For the genes detected and sum of genes the upper thresholds automatically generated are not very convincing.These thresholds allow to filter doublet cells, but they seem way too harsh on those estimattions. We could start with 50,000 for the number of umi and 7500 for the number of detected genes.
The lower thresholds allow to remove empty droplets and bad quality cells. For these ones we could use the automatic ones, as they seem more reasonable (even though they might be a bit too permissive it is fine for a first filer). These are deleting cells with less than 230.9286948 detected genes and 307.5534791 umi counts.
Finally we will also take in consideration the umi counts/detected genes ratio, that filter out cells with relatively high umi counts but very few detected genes ( lots of copies from the same genes?). As well as the automatic mitochondrial threshold, 17.5476816, this is a bit high compared with other datasets but it is already deleting 18262 cells. Lower down to 10 %, to delete the whole tail would delete 32348 cells.
Here we code the new thersholds
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
# New thresholds
discard_lib_high <- sce$sum > 50000
discard_expr_high <- sce$detected > 7500
discard_expr_low <- outlier_expr_low
discard_lib_low <- outlier_lib_low
discard_ratio <- sce$outlier_ratio
discard_mt <- outlier_mt
sce$discard <- discard_lib_high | discard_expr_high | discard_expr_low | discard_lib_low | discard_ratio | discard_mt
# See how many cells are detected as outliers
discard <- DataFrame(
lib_size_high = sum(discard_lib_high),
expression_high = sum(discard_expr_high),
lib_size_low = sum(discard_lib_low),
expression_low = sum(discard_expr_low),
mt_pct = sum(outlier_mt),
ratio_umi_genes = sum(discard_ratio),
total = sum(sce$discard)
)
write.csv(discard, here("outs", "reason_for_discard.csv"))
# Save the object at this stage with the label "preliminary analysis"
# As only annotation addition has been done, without deleting anything yet.
saveRDS(sce, here("processed", "sce_preliminary.RDS"))}else{
discard <- read.csv(here("outs", "reason_for_discard.csv"))
}
discard
## X lib_size_high expression_high lib_size_low expression_low mt_pct
## 1 1 47 69 0 2063 18262
## ratio_umi_genes total
## 1 2458 21142
Bellow we are going to plot the same plots as before with the decided thresholds
plotColData(sce, x="Sample", y="sum", colour_by="discard") +
ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="subsets_mt_percent", colour_by="discard") + ggtitle("Mitocchondrial percentatge") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="discard")
plotColData(sce, x="sum", y="detected", colour_by="discard")
It is typically a good idea to remove genes whose expression level is considered “undetectable”. Here we define a gene as detectable if at least two cells contain a transcript from the gene. It is important to keep in mind that genes must be filtered after cell filtering since some genes may only be detected in poor quality cells.
genes_beforeqc <- dim(sce)[1]
keep_feature <- rowSums(counts(sce) > 0) > 1
sce <- sce[keep_feature, ]
genes_beforeqc -dim(sce)[1]
## [1] 7785
This way we deleted 7785 genes and kept 24500 genes
We can look at a plot that shows the top 50 (by default) most-expressed features. Each row in the plot below corresponds to a gene; each bar corresponds to the expression of a gene in a single cell; and the circle indicates the median expression of each gene, with which genes are sorted. We expect to see the “usual suspects”, i.e., mitochondrial genes, actin, ribosomal protein, MALAT1. A large number of pseudo-genes or predicted genes may indicate problems with alignment.
plotHighestExprs(sce, exprs_values = "counts", colour_cells_by = "age")
The young ones are not visible on the low side, but this is just due to the fact that first is plotted one color, and then the other on top, covering the color bellow if there are dots on the same place.
if(!file.exists(here("processed", "sce_QC.RDS"))) {
sce <- sce[, sce$discard == FALSE]
saveRDS(sce, here("processed", "sce_QC.RDS"))
}
The object has 24500 genes and 113757 cells after filtering
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] org.Mm.eg.db_3.12.0 AnnotationDbi_1.52.0
## [3] scater_1.18.6 ggplot2_3.3.3
## [5] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [7] Biobase_2.50.0 GenomicRanges_1.42.0
## [9] GenomeInfoDb_1.26.2 IRanges_2.24.1
## [11] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [13] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [15] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 rsvd_1.0.3
## [3] lattice_0.20-41 rprojroot_2.0.2
## [5] digest_0.6.27 utf8_1.1.4
## [7] R6_2.5.0 RSQLite_2.2.3
## [9] evaluate_0.14 highr_0.8
## [11] pillar_1.5.1 sparseMatrixStats_1.2.1
## [13] zlibbioc_1.36.0 rlang_0.4.10
## [15] irlba_2.3.3 blob_1.2.1
## [17] Matrix_1.3-2 rmarkdown_2.7
## [19] labeling_0.4.2 BiocNeighbors_1.8.2
## [21] BiocParallel_1.24.1 stringr_1.4.0
## [23] bit_4.0.4 RCurl_1.98-1.2
## [25] munsell_0.5.0 beachmat_2.6.4
## [27] DelayedArray_0.16.2 compiler_4.0.4
## [29] vipor_0.4.5 BiocSingular_1.6.0
## [31] xfun_0.21 pkgconfig_2.0.3
## [33] ggbeeswarm_0.6.0 htmltools_0.5.1.1
## [35] tibble_3.1.0 gridExtra_2.3
## [37] GenomeInfoDbData_1.2.4 fansi_0.4.2
## [39] viridisLite_0.3.0 crayon_1.4.1
## [41] withr_2.4.1 bitops_1.0-6
## [43] grid_4.0.4 gtable_0.3.0
## [45] lifecycle_1.0.0 DBI_1.1.1
## [47] magrittr_2.0.1 scales_1.1.1
## [49] cachem_1.0.4 stringi_1.5.3
## [51] scuttle_1.0.4 farver_2.1.0
## [53] XVector_0.30.0 viridis_0.5.1
## [55] DelayedMatrixStats_1.12.3 ellipsis_0.3.1
## [57] vctrs_0.3.6 tools_4.0.4
## [59] bit64_4.0.5 glue_1.4.2
## [61] beeswarm_0.3.1 fastmap_1.1.0
## [63] yaml_2.2.1 colorspace_2.0-0
## [65] memoise_2.0.0 knitr_1.31